
# Packages
library(tidyverse)
library(tidytext)
library(udpipe)
library(quanteda)
library(stm)
library(stringr)
library(readxl)
library(tidystm)
library(scales)

# ----------------------------------------------------------
# Load data
# NOTE: Adjust the file paths below to match where you downloaded the data files.
# ----------------------------------------------------------
# Read the two speech transcripts (CSV) and the crawled news items (Excel).
speech_lee <- read_csv("Text data_Lee.csv")
speech_yoon <- read_csv("Text data_Yoon.csv")
news_data <- read_excel("news_crawling.xlsx")

# Give every table a common `text` column and a `source` tag.
# Speeches use the `remarks` column as-is; news items join keyword and title
# with a single space.
speech_lee <- speech_lee %>%
  mutate(text = remarks, source = "speech")
speech_yoon <- speech_yoon %>%
  mutate(text = remarks, source = "speech")
news_data <- news_data %>%
  mutate(text = paste(keyword, title, sep = " "), source = "news")

# Stack the three tables, keeping only the two shared columns.
combined_data <- list(speech_lee, speech_yoon, news_data) %>%
  map(~ select(.x, text, source)) %>%
  bind_rows()

# UDPIPE preprocessing
# Korean GSD model used for tokenisation and POS tagging. The .udpipe file is
# looked up relative to the working directory; adjust the path if needed.
ud_model <- udpipe_load_model(file = "korean-gsd-ud-2.5-191206.udpipe")

#' Reduce a Korean document to a space-separated string of content words
#'
#' Annotates `text` with udpipe, keeps tokens tagged NOUN or PROPN, strips
#' punctuation, digits, a trailing "들" and a fixed set of campaign boilerplate
#' words, normalises a few spelling variants, and finally drops stopwords and
#' single-character tokens.
#'
#' @param text Character input to clean (called once per document below).
#' @param model A loaded udpipe model; defaults to the script-level `ud_model`.
#' @return A length-one character vector; `""` when the input is missing or
#'   blank, when annotation fails, or when no token survives the filters.
kor_pre <- function(text, model = ud_model) {
  meaningless_stopwords <- c(
    '제가','우리','그런','이런','이거','그래서','지금','없는','있는','저도','말씀','정말','어떻게','하는','하고','하면','해서',
    '되나','된다','합니까','아닙니까','않습니까','그게','그건','글쎄','보세요','주시죠','생각합니다','그럼요','때문에','그거',
    '드린','드리면서','것은','것이','것도','께서','때는','것을','이거를','이걸','하기','답을','이날','만큼','저희가','예를',
    '께서','님이','주장','생각','본의','질문','분께','정도','말씀을','얘기를','이야기를','그걸','생각이','것이기','들의',
    '님의','에게','모두를','정도','대답을','여러분','들이','그거는','님은','주의','께서는','굳이','라마다','그때','동의',
    '가지','사람이','말씀은','거를','그렇기','오전','얘기는','일반','이건','그거를','말씀하십시오','관련','본인이',"있기",
    '냈어','자체를'
  )

  # Missing or blank input has nothing to annotate; skip it up front so it
  # does not surface as an annotation failure below.
  usable <- !is.na(text) & nzchar(trimws(text))
  if (!any(usable)) {
    return("")
  }

  # An annotation failure still yields an empty document, but it is reported:
  # silently returning "" would hide e.g. a broken model behind empty output.
  anno <- tryCatch(
    udpipe_annotate(model, x = text[usable]),
    error = function(e) {
      warning(
        "udpipe annotation failed, returning empty text: ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(anno)) {
    return("")
  }

  anno_df <- as.data.frame(anno)
  cw <- anno_df %>%
    dplyr::filter(upos %in% c("NOUN", "PROPN")) %>%
    dplyr::pull(token) %>%
    str_remove_all('[[:punct:]]+|~') %>%
    str_remove_all('[0-9]+[가-힣%]+|[0-9]') %>%
    str_remove_all('들$') %>%
    str_remove_all('후보|대선|선거|국민|모럴센스|총신|번째|티브이|토론서|법정|주관|사회분야|마지막|총평') %>%
    str_replace_all('여가부', '여성가족부') %>%
    # Negative lookahead: only extend a bare "양극". Without it, tokens that
    # already read "양극화" were rewritten to "양극화화".
    str_replace_all('양극(?!화)', '양극화') %>%
    str_replace_all('빚', '부채') %>%
    str_replace_all('토론회', '토론') %>%
    str_replace_all('감염병', '코로나') %>%
    # "국민" was removed above, so "국민의힘" arrives here as "의힘"; restore it.
    str_replace_all('의힘', '국민의힘') %>%
    str_replace_all('정부가', '정부') %>%
    str_replace_all('문제는', '문제') %>%
    str_replace_all('문제가', '문제')

  # Stopwords are matched after normalisation; tokens of one character or
  # fewer (including ones emptied by the removals above) are dropped.
  cw <- cw[!cw %in% meaningless_stopwords]
  cw <- cw[nchar(cw) > 1]

  # paste() on an empty vector with `collapse` yields "", so no special case.
  paste(cw, collapse = " ")
}
# Clean every document; map_chr() enforces exactly one string per row.
combined_data$text_clean <- purrr::map_chr(combined_data$text, kor_pre)

# STM input
# The text is already tokenised and filtered by kor_pre(), so every built-in
# normalisation step of textProcessor() is switched off here.
mypreprocess <- textProcessor(
  documents = combined_data$text_clean,
  metadata = combined_data,
  lowercase = FALSE,
  removestopwords = FALSE,
  removenumbers = FALSE,
  removepunctuation = FALSE,
  stem = FALSE,
  wordLengths = c(2, Inf)
)

# lower.thresh = 0 overrides prepDocuments()'s default frequency cut-off.
kor_out <- prepDocuments(
  documents = mypreprocess$documents,
  vocab = mypreprocess$vocab,
  meta = mypreprocess$meta,
  lower.thresh = 0
)

# Figure A4: diagnostics (searchK)
# Compare candidate topic counts K = 5, 8, 11, 14, 17, 20; the seed is fixed
# immediately before the call so the diagnostics are reproducible.
set.seed(20220804)
kresult <- searchK(
  documents = kor_out$documents,
  vocab = kor_out$vocab,
  K = seq(from = 5, to = 20, by = 3),
  data = kor_out$meta
)
plot(kresult)

# STM K=10 (Table A3)
# Fit the 10-topic model with spectral initialisation and a fixed seed.
korstm <- stm(
  documents = kor_out$documents,
  vocab = kor_out$vocab,
  K = 10,
  data = kor_out$meta,
  init.type = "Spectral",
  seed = 20220804
)
summary(korstm)
labelTopics(korstm, topics = seq_len(10), n = 10)

# Hand-assigned topic names, in topic-number order (entry i labels topic i).
t_labels <- c(
  "Covid-19 and Economy",
  "Basic Income",
  "Policy for the Disabled",
  "Security",
  "Welfare and Social Issues",
  "Lee’s Remarks on Zelensky",
  "Daejang-dong Scandal",
  "Political Reform",
  "Lee’s Remark on Key Currency",
  "Minor Parties"
)

# NOTE(review): "AppleGothic" looks like a macOS font; on other platforms a
# different family may be needed. The setting is global and is not restored.
par(family = "AppleGothic")
plot(korstm, type = "summary", custom.labels = t_labels)

# Effect: speech vs news
# Regress the prevalence of each of the 10 topics on the document source.
# The fitted model has to be supplied as `stmobj`: estimateEffect() has no
# `model` argument (and no `...`), so `model = korstm` stops with an
# "unused argument" error. `model =` belongs to extract.estimateEffect() only.
# The seed is set right before the call because the estimate relies on
# nsims = 100 simulated draws.
set.seed(20220804)
effect_result <- estimateEffect(
  formula = 1:10 ~ source,
  stmobj = korstm,
  metadata = kor_out$meta,
  nsims = 100
)
summary(effect_result)

# Figure A5: Speech - News differences
# Pull the per-topic difference (speech minus news) and its confidence bounds
# out of the estimated effects as a data frame.
STM_diff <- extract.estimateEffect(
  effect_result,
  covariate = "source",
  model = korstm,
  method = "difference",
  cov.value1 = "speech",
  cov.value2 = "news"
)

# Point estimate with error bars per topic; the dotted red line marks zero
# difference. Axes are flipped so topic labels read horizontally.
ggplot(STM_diff, aes(x = topic, y = estimate)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.2) +
  geom_hline(yintercept = 0, color = "red", linetype = "dotted") +
  scale_x_continuous(breaks = 1:10, labels = t_labels) +
  coord_flip() +
  ggtitle("Speech - News Topic Differences") +
  xlab("Topic") +
  ylab("Differences (Speech - News)") +
  theme_bw()
